Enter 2006 September

home *** CD-ROM | disk | FTP | other *** search

/ Enter 2006 September / Enter 09 2006.iso / Internet / SpamExperts Home 1.1 / SpamExperts Home.exe / lib / spamexperts.modules / spambayes / message.pyc (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2006-07-14 | 25.1 KB | 760 lines

# Source Generated with Decompyle++ # File: in.pyc (Python 2.4) """message.py - Core Spambayes classes. Classes: Message - an email.Message.Message, extended with spambayes methods SBHeaderMessage - A Message with spambayes header manipulations MessageInfoDB - persistent state storage for Message, using dbm MessageInfoZODB - persistent state storage for Message, using ZODB MessageInfoPickle - persistent state storage for Message, using pickle Abstract: MessageInfoDB is a simple shelve persistency class for the persistent state of a Message obect. The MessageInfoDB currently does not provide iterators, but should at some point. This would allow us to, for example, see how many messages have been trained differently than their classification, for fp/fn assessment purposes. Message is an extension of the email package Message class, to include persistent message information. The persistent state currently consists of the message id, its current classification, and its current training. The payload is not persisted. SBHeaderMessage extends Message to include spambayes header specific manipulations. Usage: A typical classification usage pattern would be something like: >>> import email >>> # substance comes from somewhere else >>> msg = email.message_from_string(substance, _class=SBHeaderMessage) >>> id = msg.setIdFromPayload() >>> if id is None: >>> msg.setId(time()) # or some unique identifier >>> msg.delSBHeaders() # never include sb headers in a classification >>> # bayes object is your responsibility >>> (prob, clues) = bayes.spamprob(msg.asTokens(), evidence=True) >>> msg.addSBHeaders(prob, clues) A typical usage pattern to train as spam would be something like: >>> import email >>> # substance comes from somewhere else >>> msg = email.message_from_string(substance, _class=SBHeaderMessage) >>> id = msg.setId(msgid) # id is a fname, outlook msg id, something... >>> msg.delSBHeaders() # never include sb headers in a train >>> if msg.getTraining() == False: # could be None, can't do boolean test >>> bayes.unlearn(msg.asTokens(), False) # untrain the ham >>> bayes.learn(msg.asTokens(), True) # train as spam >>> msg.rememberTraining(True) To Do: o Suggestions? """ from __future__ import generators __author__ = 'Tim Stone <tim@fourstonesExpressions.com>' __credits__ = 'Mark Hammond, Tony Meyer, all the spambayes contributors.' try: (True, False) except NameError: (True, False) = (1, 0) def bool(val): return not (not val) import os import sys import types import time import math import re import errno import shelve import warnings try: import cPickle as pickle except ImportError: import pickle import traceback import email import email.Message as email import email.Parser as email import email.Header as email import email.Generator as email from spambayes import storage from spambayes import dbmstorage from spambayes.Options import options, get_pathname_option from spambayes.tokenizer import tokenize try: import cStringIO as StringIO except ImportError: import StringIO CRLF_RE = re.compile('\\r\\n|\\r|\\n') STATS_START_KEY = 'Statistics start date' PERSISTENT_HAM_STRING = 'h' PERSISTENT_SPAM_STRING = 's' PERSISTENT_UNSURE_STRING = 'u' class MessageInfoBase(object): def __init__(self, db_name = None): self.db_name = db_name def __len__(self): return len(self.keys()) def get_statistics_start_date(self): if self.db.has_key(STATS_START_KEY): return self.db[STATS_START_KEY] else: return None def set_statistics_start_date(self, date): self.db[STATS_START_KEY] = date self.store() def __getstate__(self): return self.db def __setstate__(self, state): self.db = state def load_msg(self, msg): if self.db is not None: key = msg.getDBKey() if not key is not None: raise AssertionError, 'None is not a valid key.' try: try: attributes = self.db[key] except pickle.UnpicklingError: if hasattr(self, 'dbm'): attributes = self.dbm[key] else: raise except: hasattr(self, 'dbm') except KeyError: for att in msg.stored_attributes: if not hasattr(msg, att): setattr(msg, att, None) continue if not isinstance(attributes, types.ListType): if isinstance(attributes, types.TupleType): (msg.c, msg.t) = attributes return None elif isinstance(attributes, types.StringTypes): msg.t = { '0': False, '1': True }[attributes] return None else: print >>sys.stderr, 'Unknown message info type', attributes sys.exit(1) for att, val in attributes: setattr(msg, att, val) def store_msg(self, msg): if self.db is not None: msg.date_modified = time.time() attributes = [] for att in msg.stored_attributes: attributes.append((att, getattr(msg, att))) key = msg.getDBKey() if not key is not None: raise AssertionError, 'None is not a valid key.' self.db[key] = attributes self.store() def remove_msg(self, msg): if self.db is not None: del self.db[msg.getDBKey()] self.store() def keys(self): return self.db.keys() class MessageInfoPickle(MessageInfoBase): def __init__(self, db_name, pickle_type = 1): MessageInfoBase.__init__(self, db_name) self.mode = pickle_type self.load() def load(self): try: fp = open(self.db_name, 'rb') except IOError: e = None if e.errno == errno.ENOENT: self.db = { } else: raise except: e.errno == errno.ENOENT self.db = pickle.load(fp) fp.close() def close(self): pass def store(self): fp = open(self.db_name, 'wb') pickle.dump(self.db, fp, self.mode) fp.close() class MessageInfoDB(MessageInfoBase): def __init__(self, db_name, mode = 'c'): MessageInfoBase.__init__(self, db_name) self.mode = mode self.load() def load(self): try: self.dbm = dbmstorage.open(self.db_name, self.mode) self.db = shelve.Shelf(self.dbm) except dbmstorage.error: if options[('globals', 'verbose')]: print 'Warning: no dbm modules available for MessageInfoDB' self.dbm = None self.db = None def __del__(self): self.close() def close(self): def noop(): pass getattr(self.db, 'close', noop)() getattr(self.dbm, 'close', noop)() def store(self): if self.db is not None: self.db.sync() try: from persistent import Persistent except ImportError: Persistent = object class _PersistentMessageInfo(MessageInfoBase, Persistent): def __init__(self): import ZODB OOBTree = OOBTree import BTrees.OOBTree MessageInfoBase.__init__(self) self.db = OOBTree() class MessageInfoZODB(storage.ZODBClassifier): ClassifierClass = _PersistentMessageInfo def __init__(self, db_name, mode = 'c'): self.nham = self.nspam = 0 storage.ZODBClassifier.__init__(self, db_name, mode) self.classifier.store = self.store self.db = self.classifier def __setattr__(self, att, value): object.__setattr__(self, att, value) _storage_types = { 'dbm': (MessageInfoDB, True, True), 'pickle': (MessageInfoPickle, False, True), 'zodb': (MessageInfoZODB, True, True) } def open_storage(data_source_name, db_type = 'dbm', mode = None): '''Return a storage object appropriate to the given parameters.''' try: (klass, supports_mode, unused) = _storage_types[db_type] except KeyError: raise storage.NoSuchClassifierError(db_type) if supports_mode and mode is not None: return klass(data_source_name, mode) else: return klass(data_source_name) def database_type(): dn = ('Storage', 'messageinfo_storage_file') (nm, typ) = storage.database_type((), default_name = dn) if typ not in _storage_types.keys(): typ = 'pickle' return (nm, typ) class Message(object, email.Message.Message): '''An email.Message.Message extended for SpamBayes''' def __init__(self, id = None): email.Message.Message.__init__(self) self.stored_attributes = [ 'c', 't', 'date_modified'] self.getDBKey = self.getId self.id = None self.c = None self.t = None self.date_modified = None if id is not None: self.setId(id) _message_info_db = None def _get_class_message_info_db(klass): if klass._message_info_db is None: (nm, typ) = database_type() klass._message_info_db = open_storage(nm, typ) return klass._message_info_db _get_class_message_info_db = classmethod(_get_class_message_info_db) def _set_class_message_info_db(klass, value): klass._message_info_db = value _set_class_message_info_db = classmethod(_set_class_message_info_db) def _get_message_info_db(self): return self._get_class_message_info_db() def _set_message_info_db(self, value): self._set_class_message_info_db(value) message_info_db = property(_get_message_info_db, _set_message_info_db) def setPayload(self, payload): """DEPRECATED. This function does not work (as a result of using private methods in a hackish way) in Python 2.4, so is now deprecated. Use *_from_string as described above. More: Python 2.4 has a new email package, and the private functions are gone. So this won't even work. We have to do something to get this to work, for the 1.0.x branch, so use a different ugly hack. """ warnings.warn('setPayload is deprecated. Use email.message_from_string(payload, _class=Message) instead.', DeprecationWarning, 2) new_me = email.message_from_string(payload, _class = Message) self.__dict__.update(new_me.__dict__) def setId(self, id): if self.id and self.id != id: raise ValueError, 'MsgId has already been set, cannot be changed' + `self.id` + `id` if id is None: raise ValueError, 'MsgId must not be None' if type(id) not in types.StringTypes: raise TypeError, 'Id must be a string' if id == STATS_START_KEY: raise ValueError, 'MsgId must not be' + STATS_START_KEY self.id = id self.message_info_db.load_msg(self) def getId(self): return self.id def tokenize(self): return tokenize(self) def _force_CRLF(self, data): '''Make sure data uses CRLF for line termination.''' return CRLF_RE.sub('\r\n', data) def as_string(self, unixfrom = False, mangle_from_ = True): try: fp = StringIO.StringIO() g = email.Generator.Generator(fp, mangle_from_ = mangle_from_) g.flatten(self, unixfrom) return self._force_CRLF(fp.getvalue()) except TypeError: parts = [] for part in self.get_payload(): parts.append(email.Message.Message.as_string(part, unixfrom)) return self._force_CRLF('\n'.join(parts)) def modified(self): if self.id: self.message_info_db.store_msg(self) def GetClassification(self): if self.c == PERSISTENT_SPAM_STRING: return options[('Headers', 'header_spam_string')] elif self.c == PERSISTENT_HAM_STRING: return options[('Headers', 'header_ham_string')] elif self.c == PERSISTENT_UNSURE_STRING: return options[('Headers', 'header_unsure_string')] def RememberClassification(self, cls): if cls == options[('Headers', 'header_spam_string')]: self.c = PERSISTENT_SPAM_STRING elif cls == options[('Headers', 'header_ham_string')]: self.c = PERSISTENT_HAM_STRING elif cls == options[('Headers', 'header_unsure_string')]: self.c = PERSISTENT_UNSURE_STRING else: raise ValueError, 'Classification must match header strings in options' self.modified() def GetTrained(self): return self.t def RememberTrained(self, isSpam): self.t = isSpam self.modified() def __repr__(self): return 'spambayes.message.Message%r' % repr(self.__getstate__()) def __getstate__(self): return (self.id, self.c, self.t) def __setstate__(self, t): (self.id, self.c, self.t) = t class SBHeaderMessage(Message): '''Message class that is cognizant of SpamBayes headers. Adds routines to add/remove headers for SpamBayes''' def setPayload(self, payload): '''DEPRECATED. ''' warnings.warn('setPayload is deprecated. Use email.message_from_string(payload, _class=SBHeaderMessage) instead.', DeprecationWarning, 2) new_me = email.message_from_string(payload, _class = SBHeaderMessage) self.__dict__.update(new_me.__dict__) def setIdFromPayload(self): try: self.setId(self[options[('Headers', 'mailid_header_name')]]) except ValueError: return None return self.id def setDisposition(self, prob): if prob < options[('Categorization', 'ham_cutoff')]: disposition = options[('Headers', 'header_ham_string')] elif prob > options[('Categorization', 'spam_cutoff')]: disposition = options[('Headers', 'header_spam_string')] else: disposition = options[('Headers', 'header_unsure_string')] self.RememberClassification(disposition) def addSBHeaders(self, prob, clues): """Add hammie header, and remember message's classification. Also, add optional headers if needed.""" self.setDisposition(prob) disposition = self.GetClassification() self[options[('Headers', 'classification_header_name')]] = disposition if options[('Headers', 'include_score')]: disp = '%.*f' % (options[('Headers', 'header_score_digits')], prob) if options[('Headers', 'header_score_logarithm')]: if prob <= 0.0050000000000000001 and prob > 0.0: x = -math.log10(prob) disp += ' (%d)' % x if prob >= 0.995 and prob < 1.0: x = -math.log10(1.0 - prob) disp += ' (%d)' % x self[options[('Headers', 'score_header_name')]] = disp if options[('Headers', 'include_thermostat')]: thermostat = '**********' self[options[('Headers', 'thermostat_header_name')]] = thermostat[:int(prob * 10)] if options[('Headers', 'include_evidence')]: hco = options[('Headers', 'clue_mailheader_cutoff')] sco = 1 - hco evd = [] for word, score in clues: if word == '*H*' and word == '*S*' and score <= hco or score >= sco: if isinstance(word, types.UnicodeType): word = email.Header.Header(word, charset = 'utf-8').encode() try: evd.append('%r: %.2f' % (word, score)) except TypeError: evd.append('%r: %s' % (word, score)) except: None<EXCEPTION MATCH>TypeError None<EXCEPTION MATCH>TypeError wrappedEvd = [] headerName = options[('Headers', 'evidence_header_name')] lineLength = len(headerName) + len(': ') for component, index in zip(evd, range(len(evd))): wrappedEvd.append(component) lineLength += len(component) if index < len(evd) - 1: if lineLength + len('; ') + len(evd[index + 1]) < 78: wrappedEvd.append('; ') else: wrappedEvd.append(';\n\t') lineLength = 8 lineLength + len('; ') + len(evd[index + 1]) < 78 self[headerName] = ''.join(wrappedEvd) if options[('Headers', 'add_unique_id')]: self[options[('Headers', 'mailid_header_name')]] = self.id self.addNotations() def addNotations(self): """Add the appropriate string to the subject: and/or to: header. This is a reasonably ugly method of including the classification, but no-one has a better idea about how to allow filtering in 'stripped down' mailers (i.e. Outlook Express), so, for the moment, this is it. """ disposition = self.GetClassification() self.notateTo(disposition) self.notateSubject(disposition) def notateTo(self, disposition): if isinstance(options[('Headers', 'notate_to')], types.StringTypes): notate_to = (options[('Headers', 'notate_to')],) else: notate_to = options[('Headers', 'notate_to')] if disposition in notate_to: address = '%s@spambayes.invalid' % (disposition,) try: self.replace_header('To', '%s,%s' % (address, self['To'])) except KeyError: self['To'] = address except: None<EXCEPTION MATCH>KeyError None<EXCEPTION MATCH>KeyError def notateSubject(self, disposition): if isinstance(options[('Headers', 'notate_subject')], types.StringTypes): notate_subject = (options[('Headers', 'notate_subject')],) else: notate_subject = options[('Headers', 'notate_subject')] if disposition in notate_subject: try: self.replace_header('Subject', '%s,%s' % (disposition, self['Subject'])) except KeyError: self['Subject'] = disposition except: None<EXCEPTION MATCH>KeyError None<EXCEPTION MATCH>KeyError def delNotations(self): """If present, remove our notation from the subject: and/or to: header of the message. This is somewhat problematic, as we cannot be 100% positive that we added the notation. It's almost certain to be us with the to: header, but someone else might have played with the subject: header. However, as long as the user doesn't turn this option on and off, this will all work nicely. See also [ 848365 ] Remove subject annotations from message review page """ subject = self['Subject'] if subject: ham = options[('Headers', 'header_ham_string')] + ',' spam = options[('Headers', 'header_spam_string')] + ',' unsure = options[('Headers', 'header_unsure_string')] + ',' if options[('Headers', 'notate_subject')]: for disp in (ham, spam, unsure): if subject.startswith(disp): self.replace_header('Subject', subject[len(disp):]) break continue to = self['To'] if to: ham = '%s@spambayes.invalid,' % (options[('Headers', 'header_ham_string')],) spam = '%s@spambayes.invalid,' % (options[('Headers', 'header_spam_string')],) unsure = '%s@spambayes.invalid,' % (options[('Headers', 'header_unsure_string')],) if options[('Headers', 'notate_to')]: for disp in (ham, spam, unsure): if to.startswith(disp): self.replace_header('To', to[len(disp):]) break continue def currentSBHeaders(self): '''Return a dictionary containing the current values of the SpamBayes headers. This can be used to restore the values after using the delSBHeaders() function.''' headers = { } for header_name in [ options[('Headers', 'classification_header_name')], options[('Headers', 'mailid_header_name')], options[('Headers', 'classification_header_name')] + '-ID', options[('Headers', 'thermostat_header_name')], options[('Headers', 'evidence_header_name')], options[('Headers', 'score_header_name')], options[('Headers', 'trained_header_name')]]: value = self[header_name] if value is not None: headers[header_name] = value continue return headers def delSBHeaders(self): del self[options[('Headers', 'classification_header_name')]] del self[options[('Headers', 'mailid_header_name')]] del self[options[('Headers', 'classification_header_name')] + '-ID'] del self[options[('Headers', 'thermostat_header_name')]] del self[options[('Headers', 'evidence_header_name')]] del self[options[('Headers', 'score_header_name')]] del self[options[('Headers', 'trained_header_name')]] self.delNotations() def insert_exception_header(string_msg, msg_id = None): '''Insert an exception header into the given RFC822 message (as text). Returns a tuple of the new message text and the exception details.''' stream = StringIO.StringIO() traceback.print_exc(None, stream) details = stream.getvalue() detailLines = details.strip().split('\n') dottedDetails = '\n.'.join(detailLines) headerName = 'X-Spambayes-Exception' header = email.Header.Header(dottedDetails, header_name = headerName) try: (headers, body) = re.split('\\n\\r?\\n', string_msg, 1) except ValueError: headers = string_msg body = '' header = re.sub('\\r?\\n', '\r\n', str(header)) headers += '\n%s: %s\r\n' % (headerName, header) if msg_id: headers += '%s: %s\r\n' % (options[('Headers', 'mailid_header_name')], msg_id) return (headers + '\r\n' + body, details)